Emile Cohen
June 2020
Goal: In this notebook, we want to understand what makes Non-Small Cell Lung Cancer a textbook case for the patterns we saw, and which major subcohorts drive the signal.
%run -i '../../../../../utils/setup_environment.ipy'
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import fisher_exact, ranksums, chi2, norm
from statsmodels.sandbox.stats.multicomp import multipletests
import matplotlib.gridspec as gridspec
import pickle
data_path = '../../../../../data/'
data_wgd = data_path + 'impact-facets-tp53/processed/wgd/'
data_no_wgd = data_path + 'impact-facets-tp53/processed/no_wgd/'
from functools import reduce
def get_hotspots(df: pd.DataFrame, Sample_Type: str, group: list = None, group_type: str = None):
    """Count TP53 hotspot labels across the five ``tp53_spot_*`` columns.

    Parameters
    ----------
    df : table with a ``Sample_Type`` column and ``tp53_spot_1`` ..
        ``tp53_spot_5`` columns.
    Sample_Type : only rows whose ``Sample_Type`` equals this value are kept.
    group, group_type : when both are given, rows are further restricted to
        those whose ``group_type`` column value is in ``group``.

    Returns
    -------
    DataFrame indexed by hotspot label with columns ``count_1`` ..
    ``count_5`` and ``total`` (row sum), sorted by ``total`` descending.
    The ``'nan'`` label is removed when present.
    """
    data = df[df['Sample_Type'] == Sample_Type]
    if group and group_type:
        data = data[data[group_type].isin(group)]

    # get_groupby is a project helper defined outside this file; the code
    # below assumes it returns one count column indexed by the grouped value.
    spot_columns = ['tp53_spot_{}'.format(i) for i in range(1, 6)]
    per_slot_counts = [get_groupby(data, column, 'count') for column in spot_columns]

    # Outer-join on the hotspot label. concat avoids the colliding
    # '_x'/'_y' suffixes that a chain of merges creates when every input
    # column carries the same name.
    df_merged = pd.concat(per_slot_counts, axis=1).fillna(0)
    df_merged.columns = ['count_1', 'count_2', 'count_3', 'count_4', 'count_5']
    df_merged['total'] = df_merged.sum(axis=1)
    df_merged = df_merged.sort_values(by='total', ascending=False)

    # The 'nan' label is not guaranteed to exist in every sub-cohort.
    return df_merged.drop('nan', errors='ignore')
def get_hotspot_frac(df: pd.DataFrame, group_type:str = None, group: list = None, nb = 10):
    """Summarise ``frac_genome_altered`` for the most frequent ``tp53_spot_1`` values.

    When both ``group_type`` and ``group`` are given, ``df`` is first
    restricted to rows whose ``group_type`` value is in ``group``.

    Returns a DataFrame whose first row is the literal header
    ``['spot', '#', 'frac']`` followed by one row per hotspot (at most ``nb``):
    the hotspot label, the number of rows carrying it, and the median
    ``frac_genome_altered`` of those rows.
    """
    if group_type and group:
        df = df[df[group_type].isin(group)]

    spot_counts = get_groupby(df, 'tp53_spot_1', 'count')
    top_spots = spot_counts.sort_values(by='count', ascending=False).head(nb).index.tolist()

    rows = [['spot', '#', 'frac']]
    for hotspot in top_spots:
        fga = df[df['tp53_spot_1'] == hotspot].frac_genome_altered
        rows.append([hotspot, fga.shape[0], fga.median()])
    return pd.DataFrame(rows)
def boxplot_sampletype(df: pd.DataFrame, group:str, palette, order, metrics: str, figsize= (10,3), title: str = '', title_font: int=12, xlim=(0,1)):
    """Draw a boxplot of ``metrics`` per ``group`` with sample counts in the tick labels.

    Parameters
    ----------
    df : data to plot.
    group : column used for the x axis categories.
    palette : passed to ``sns.boxplot``.
    order : category order on the x axis; each tick is labelled
        ``"<category>\\n(<count>)"``.
    metrics : column plotted on the y axis.
    figsize, title, title_font : figure size, title text and title font size.
    xlim : limits applied to the *y* axis (the name is kept for existing callers).

    Returns
    -------
    (fig, ax)
    """
    fig = plt.figure(figsize=figsize)
    ax = plt.subplot2grid(shape=(2, 1), loc=(0, 0), colspan=1)
    sns.boxplot(y=metrics, x=group, data=df, ax=ax, dodge=False, order=order, palette=palette).set_title(title, weight='bold', fontsize=title_font)

    counts = get_groupby(df, group, 'count')
    labels = []
    for element in order:
        # A category listed in `order` may be absent from this sub-cohort;
        # report it as 0 instead of failing on the lookup.
        n_samples = counts.loc[element].values[0] if element in counts.index else 0
        labels.append('{}\n({})'.format(element, n_samples))
    ax.set_xticklabels(labels)

    style(ax)
    ax.set_ylim(xlim)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    return fig, ax
def get_statistics(df: pd.DataFrame, group:str, metrics: str, group_list: list):
    """Compare ``metrics`` between the first two groups named in ``group_list``.

    Returns a 5x3 DataFrame laid out for display: a header row, one row per
    group (label, number of rows, median of ``metrics``), a second header
    row, and the Wilcoxon rank-sum statistic with its p-value.
    """
    first_label, second_label = group_list[0], group_list[1]
    first_values = df.loc[df[group] == first_label, metrics]
    second_values = df.loc[df[group] == second_label, metrics]

    # Missing values are left out of the test but still counted in 'size'.
    statistic, p_value = ranksums(first_values.dropna().values,
                                  second_values.dropna().values)

    summary_rows = [
        ['', 'size', metrics],
        [first_label, first_values.shape[0], first_values.median()],
        [second_label, second_values.shape[0], second_values.median()],
        ['', 'Statistics', 'p-value'],
        ['', statistic, p_value],
    ]
    return pd.DataFrame(summary_rows)
def get_major_codrivers(master: pd.DataFrame, maf: pd.DataFrame, head:int = 10, tp53=False):
    """Return the most frequently mutated driver genes among the samples of ``master``.

    Parameters
    ----------
    master : table with a ``Tumor_Id`` column listing the samples to keep.
    maf : mutation table with ``Tumor_Sample_Barcode``, ``Hugo_Symbol`` and
        ``driver`` columns.
    head : number of genes to return.
    tp53 : when False (default) TP53 mutations are excluded from the counts.

    Returns
    -------
    DataFrame indexed by ``Hugo_Symbol`` with a single ``count`` column,
    sorted by ``count`` descending and truncated to ``head`` rows.
    """
    samples = master.Tumor_Id.tolist()

    # One combined mask instead of chained df[m1][m2][m3]: chaining makes
    # pandas re-align each later mask to the already filtered frame.
    keep = maf.Tumor_Sample_Barcode.isin(samples) & (maf['driver'] == True)  # noqa: E712
    if not tp53:
        keep = keep & (maf['Hugo_Symbol'] != 'TP53')
    maf_filtered = maf[keep]

    h = maf_filtered.groupby('Hugo_Symbol').size().to_frame('count')
    return h.sort_values(by='count', ascending=False).head(head)
def create_co_drivers_table(master: pd.DataFrame, group_type:str, group_1: str, group_2: str):
    """Build a side-by-side co-driver frequency table for two sub-groups of ``master``.

    For each of ``group_1`` and ``group_2`` (values of the ``group_type``
    column) the top 100 driver genes are counted in the module-level
    ``maf_cohort_nowgd`` table, and each gene's share of those counts is
    expressed as a percentage.

    Returns
    -------
    DataFrame indexed by ``Hugo_Symbol`` (genes present in both groups) with
    columns ``count_x``, ``proportion_1``, ``count_y``, ``proportion_2``.
    ``proportion_1`` is negated so the two groups can be drawn as mirrored bars.
    """
    def _codriver_proportions(group_value, proportion_column):
        # Count co-drivers for one sub-group and add its percentage column.
        subset = master[master[group_type] == group_value]
        table = get_major_codrivers(master=subset, maf=maf_cohort_nowgd, head=100)
        # The total is computed once rather than once per row.
        total = table['count'].sum()
        table[proportion_column] = 100 * (table['count'] / total).round(4)
        return table

    co_drivers_group_1 = _codriver_proportions(group_1, 'proportion_1')
    co_drivers_group_2 = _codriver_proportions(group_2, 'proportion_2')

    co_drivers_groups = pd.merge(co_drivers_group_1, co_drivers_group_2, on='Hugo_Symbol')
    co_drivers_groups['proportion_1'] = -co_drivers_groups['proportion_1']
    return co_drivers_groups
# Cancer type analysed in the cells below.
cancer = 'Non-Small Cell Lung Cancer'
# Master sample tables for the non-WGD and WGD cohorts.
# non_wgd_load_and_cut is a project helper defined outside this file
# (presumably loads the pickle and applies a cohort cut - TODO confirm).
master_no_wgd = non_wgd_load_and_cut(data_path + 'impact-facets-tp53/processed/no_wgd/master_no_wgd.pkl')
master_wgd = pd.read_pickle(data_path + 'impact-facets-tp53/processed/wgd/master_wgd.pkl')
# Restrict both master tables to the selected cancer type.
master_no_wgd_cancer = master_no_wgd[master_no_wgd['Cancer_Type'] == cancer]
master_wgd_cancer = master_wgd[master_wgd['Cancer_Type'] == cancer]
# Mutation (MAF) tables; 'Unnamed: 0' is the index column written out with the file.
maf_cohort_nowgd = pd.read_csv(data_path + 'impact-facets-tp53/processed/no_wgd/maf_cohort_nowgd.txt', sep='\t').drop('Unnamed: 0', axis=1)
maf_cohort_wgd = pd.read_csv(data_path + 'impact-facets-tp53/processed/wgd/maf_cohort_wgd.txt', sep='\t').drop('Unnamed: 0', axis=1)
# Bare expressions: only displayed when they end a notebook cell.
# WGD samples (all cancer types) carrying at least one TP53 mutation.
master_wgd[master_wgd['tp53_count'] >=1]
# Sum over the first four rows of the per-tp53_cn_state counts for those samples.
get_groupby(master_wgd[master_wgd['tp53_count'] >=1],'tp53_cn_state', 'count')[:4].sum()
Non-Small Cell Lung Cancer is one of the largest cancer types in our cohort. It ranks fourth in terms of WGD proportion, with an average WGD proportion of around 42%.


Non-Small Cell Lung Cancer shows a significant difference in Genome Instability between the TP53 Mono-Allelic and Bi-Allelic subgroups, and has many samples in both groups.

In the TP53 subgroup Pan Cancer plot that follows, we can see 3 important signals:


2 main key points:

In this section, our goal is to find subcohorts that lead the signals observed. Here are the different subcohort we will create:
In this section, we cut our cohort to only keep samples with exactly one TP53 mutation, for simplicity.
master_hotspot = master_no_wgd_cancer[master_no_wgd_cancer['tp53_count'] == 1]
get_hotspot_frac(df=master_hotspot,
group_type=None,
group=None)
# Stacked bar of TP53 mutation categories for the single-mutation cohort.
h = get_groupby(master_hotspot,'tp53_vc_group_1', 'count').sort_values(by='count', ascending=False)
display(h)
# Put the categories in columns, in mutation_list order. A category absent
# from this cohort becomes 0 instead of raising a KeyError on selection.
h = h.T.reindex(columns=mutation_list, fill_value=0)
fig = plt.figure(figsize=(6,1))
ax = plt.subplot()
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
h_plot = h.plot(kind = 'barh', stacked=True, yticks=[], ax=ax, colormap="Accent")
ax.legend(['In Frame', 'Truncated', 'Missense', 'Hotspot 248','Hotspot 273','Hotspot 175', 'Other Hotspot'],loc='center left', bbox_to_anchor=(1.2, 0.5), fontsize=11)
ax.set_title('Mutation Type - {} - No WGD'.format(cancer), weight='bold', fontsize=18)
plt.show()
fig, ax = boxplot_sampletype(df=master_hotspot,
group='tp53_vc_group_1',
palette=mutation_palette,
order=mutation_list,
metrics='frac_genome_altered',
figsize=(6,10),
title='Fraction of Genome Altered - {}'.format(cancer),
xlim=[0,1])
plt.show()
print('Number of Bi Allelic samples (with 1 mut): ' + str(master_hotspot[master_hotspot['tp53_res_group'] == 'no_tp53_res'].shape[0]))
print('')
print('Number of TP53 Residual samples (with 1 mut): ' + str(master_hotspot[master_hotspot['tp53_res_group'] == 'tp53_res'].shape[0]))
total_df = []
for group in ['tp53_res', 'no_tp53_res']:
h = get_groupby(master_hotspot[master_hotspot['tp53_res_group'] == group], 'tp53_vc_group_1', group).sort_values(by=group, ascending=False)
total_df.append(h)
h=h.T
for mut in mutation_list:
if mut not in h.columns:
h[mut] = 0
h = h[mutation_list]
fig = plt.figure(figsize=(6,1))
ax = plt.subplot()
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
h_plot = h.plot(kind = 'barh', stacked=True, yticks=[], ax=ax, colormap="Accent")
if group == 'tp53_res':
ax.legend(['In Frame', 'Truncated', 'Missense', 'Hotspot 248','Hotspot 273','Hotspot 175', 'Other Hotspot'],loc='center left', bbox_to_anchor=(1.05, 0.5), fontsize=11)
else: ax.get_legend().remove()
ax.set_title('Mutation Type - {} - No WGD'.format(group), weight='bold', fontsize=18)
plt.show()
display_side_by_side(total_df[0],total_df[1])
for group in ['tp53_res', 'no_tp53_res']:
master_wt = master_hotspot[master_hotspot['tp53_res_group'] == group]
fig, ax = boxplot_sampletype(df=master_wt,
group='tp53_vc_group_1',
palette=mutation_palette,
order=mutation_list,
metrics='frac_genome_altered',
figsize=(6,10),
title='Fraction of Genome Altered - No WGD - {} subgroup'.format(group),
xlim=[0,1])
plt.show()
In this section we compare SNV and INDEL mutations. As in the previous section, we cut the cohort to keep only samples with exactly 1 tp53 mutation.
# One boxplot and one row of pairwise statistics per driver-count metric.
# Each entry: (metric column, plot title, upper y-limit).
_driver_count_panels = [
    ('driver_mutation_count', 'Driver Mutation Count - TP53 Subroups - No WGD', 10),
    ('snv_driver_mutation_count', 'SNV Driver Mutation Count - TP53 Subroups - No WGD', 15),
    ('indel_driver_mutation_count', 'INDEL Driver Mutation Count - TP53 Subroups - No WGD', 35),
]
# TP53 subgroup pairs compared with get_statistics for every metric.
_driver_count_pairs = [
    ['1_WILD_TYPE', '0_HETLOSS'],
    ['1_WILD_TYPE', '>=1_LOSS'],
    ['>1muts', '>=1_LOSS'],
]

for _metric, _panel_title, _upper in _driver_count_panels:
    fig, ax = boxplot_sampletype(df=master_no_wgd_cancer,
                                 group='tp53_group',
                                 palette=palette,
                                 order=group_list,
                                 metrics=_metric,
                                 figsize=(8,12),
                                 title=_panel_title,
                                 xlim=[-0.1,_upper])
    plt.show()
    display_side_by_side(*[get_statistics(df=master_no_wgd_cancer,
                                          group='tp53_group',
                                          metrics=_metric,
                                          group_list=_pair)
                           for _pair in _driver_count_pairs])
Here, one major information:
The idea here is to see if we have differences in Fraction of Genome Altered if we cut our Cancer cohort on the number of drivers per sample.
Do we have more instability with more INDEL Driver Mutations within the same subgroup?
# INDEL driver count above which a sample is labelled 'High Co-Driver Count'.
thr = 1


def get_driver_groups(x):
    """Label a sample row as high or low co-driver count.

    Compares ``x.indel_driver_mutation_count`` with the module-level ``thr``.
    A missing (NaN) count satisfies neither comparison and yields ``None``.
    """
    if x.indel_driver_mutation_count > thr:
        return 'High Co-Driver Count'
    if x.indel_driver_mutation_count <= thr:
        return 'Low Co-Driver Count'
    return None


_co_driver_order = ['High Co-Driver Count', 'Low Co-Driver Count']
_co_driver_palette = {'High Co-Driver Count': '#FF9900' , 'Low Co-Driver Count': '#146EB4'}

# --- 1_WILD_TYPE subgroup ---
# .copy() so the new column is added to an independent frame, not to a
# filtered view of master_no_wgd_cancer.
master_no_wgd_cancer_wt = master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == '1_WILD_TYPE'].copy()
master_no_wgd_cancer_wt['co_driver_group'] = master_no_wgd_cancer_wt.apply(get_driver_groups, axis=1)
fig, ax = boxplot_sampletype(df=master_no_wgd_cancer_wt,
                             group='co_driver_group',
                             palette=_co_driver_palette,
                             order=_co_driver_order,
                             metrics='frac_genome_altered',
                             figsize=(4,10),
                             title='Fraction of Genome Altered - 1_WILD_TYPE subgroup - Co Driver Count (thr={}) - {}'.format(thr,cancer),
                             xlim=[0,1])
plt.show()
get_statistics(df=master_no_wgd_cancer_wt,
               group='co_driver_group',
               metrics='frac_genome_altered',
               group_list=_co_driver_order)

# --- 0_HETLOSS subgroup ---
master_no_wgd_cancer_het = master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == '0_HETLOSS'].copy()
master_no_wgd_cancer_het['co_driver_group'] = master_no_wgd_cancer_het.apply(get_driver_groups, axis=1)
fig, ax = boxplot_sampletype(df=master_no_wgd_cancer_het,
                             group='co_driver_group',
                             palette=_co_driver_palette,
                             order=_co_driver_order,
                             metrics='frac_genome_altered',
                             figsize=(4,10),
                             title='Fraction of Genome Altered - 0_HETLOSS subgroup - Co Driver Count (thr={}) - {}'.format(thr,cancer),
                             xlim=[0,1])
plt.show()
get_statistics(df=master_no_wgd_cancer_het,
               group='co_driver_group',
               metrics='frac_genome_altered',
               group_list=_co_driver_order)
codrivers_cancer = get_major_codrivers(master=master_no_wgd_cancer,
maf=maf_cohort_nowgd,
head=15)
codrivers_cancer_tp53 = get_major_codrivers(master=master_no_wgd_cancer[master_no_wgd_cancer['tp53_count'] >= 1],
maf=maf_cohort_nowgd,
head=15)
co_drivers = pd.merge(codrivers_cancer, codrivers_cancer_tp53, on='Hugo_Symbol')
co_drivers.columns = ['cancer', 'cancer_tp53']
co_drivers['ratio'] = co_drivers.apply(lambda x: 100*round(x.cancer_tp53/x.cancer, 4) , axis=1)
co_drivers = co_drivers.sort_values(by='ratio', ascending=False)
co_drivers
codrivers_cancer
labels = []
codrivers_cancer = get_major_codrivers(master=master_no_wgd_cancer,
maf=maf_cohort_nowgd,
head=15,
tp53=True)
codrivers_cancer['proportion'] = codrivers_cancer.apply(lambda x: 100* round(x['count'] / codrivers_cancer.sum().values[0], 4), axis=1)
for element in codrivers_cancer.head(15).index.tolist():
labels.append(element + ' ('+ str(int(codrivers_cancer.loc[element]['count']))+')')
ax = sns.barplot(y=codrivers_cancer.head(15).index, x='proportion',data=codrivers_cancer.head(15)[['proportion']], color='#7F8C8D', saturation=.2)
ax.set_yticklabels(labels)
ax.set_title('Drivers Frequency in {}'.format(cancer))
labels = []
for element in co_drivers.index.tolist():
labels.append(element + ' ('+ str(int(co_drivers.loc[element]['cancer']))+')')
ax = sns.barplot(y=co_drivers.index, x='ratio',data=co_drivers[['ratio']], color='#7F8C8D', saturation=.2)
ax.set_yticklabels(labels)
ax.set_title('Co-Drivers Enrichment in TP53 State')
co_drivers_res = create_co_drivers_table(master=master_no_wgd_cancer,
group_type='tp53_res_group',
group_1='tp53_res',
group_2='no_tp53_res')
co_drivers_res
# Mirrored horizontal bars: TP53 Residual (left) vs No TP53 Residual (right).
fig = plt.figure(figsize=(7,7))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
for _side in ('right', 'left', 'top'):
    ax.spines[_side].set_visible(False)
co_drivers_res[['proportion_1', 'proportion_2']].head(10)[::-1].plot.barh(stacked=True, ax=ax, width=1, color = ['#2ECC71','#1E8449'])
ax.legend(['TP53 Residual', 'No TP53 Residual'], fontsize=10)
ax.set_title('Co-Drivers Proportion per TP53 State', fontsize=14)
plt.yticks(fontsize=10)
ax.set_ylabel('')
# proportion_1 is stored negated (see create_co_drivers_table) so the bars
# mirror around 0. Label each tick with its magnitude, derived from the real
# tick position, instead of a hard-coded list that only fits one data set.
ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda value, pos: '{:g}'.format(abs(value))))
ax.tick_params(axis='x', labelsize=10)
# Positional None toggles the grid exactly like the former b=None, and works
# both before and after matplotlib renamed that keyword to `visible`.
plt.grid(None)
plt.show()
co_drivers_cnloh_loss = create_co_drivers_table(master=master_no_wgd_cancer,
group_type='tp53_group',
group_1='>=1_cnLOH',
group_2='>=1_LOSS')
co_drivers_cnloh_loss
# count_x / proportion_1 belong to group_1 of co_drivers_cnloh_loss, which was
# built with group_1='>=1_cnLOH'. `subgroup` was previously never defined.
subgroup = '>=1_cnLOH'
_top_cnloh = co_drivers_cnloh_loss.head(10)
labels = ['{} ({})'.format(gene, int(_top_cnloh.loc[gene]['count_x']))
          for gene in _top_cnloh.index.tolist()]
# NOTE(review): create_co_drivers_table negates proportion_1, so these bars
# extend to the left of 0 - confirm whether absolute values were intended.
ax = sns.barplot(y=_top_cnloh.index, x='proportion_1',data=_top_cnloh[['proportion_1']], color='#7F8C8D', saturation=.2)
ax.set_yticklabels(labels)
ax.set_title('Co-Drivers Frequency in {} - {}'.format(subgroup, cancer))
# Mirrored horizontal bars: >=1_cnLOH (left) vs >=1_LOSS (right).
fig = plt.figure(figsize=(8,8))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
for _side in ('right', 'left', 'top'):
    ax.spines[_side].set_visible(False)
co_drivers_cnloh_loss[['proportion_1', 'proportion_2']].head(15)[::-1].plot.barh(stacked=True, ax=ax, width=1, color = [mc[4],mc[0]])
ax.legend(['>=1_cnLOH', '>=1_LOSS'], fontsize=10)
ax.set_title('Co-Drivers Proportion per TP53 State', fontsize=14)
plt.yticks(fontsize=10)
ax.set_ylabel('')
# Keep matplotlib's own tick labels and only set their size: a hard-coded
# label list mislabels the axis whenever the automatic ticks differ from it.
ax.tick_params(axis='x', labelsize=10)
# Positional None toggles the grid exactly like the former b=None, and works
# both before and after matplotlib renamed that keyword to `visible`.
plt.grid(None)
plt.show()
co_drivers_losses = create_co_drivers_table(master=master_no_wgd_cancer,
group_type='tp53_group',
group_1='0_HETLOSS',
group_2='>=1_LOSS')
co_drivers_losses
# Mirrored horizontal bars: 0_HETLOSS (left) vs >=1_LOSS (right).
fig = plt.figure(figsize=(7,7))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
for _side in ('right', 'left', 'top'):
    ax.spines[_side].set_visible(False)
co_drivers_losses[['proportion_1', 'proportion_2']].head(15)[::-1].plot.barh(stacked=True, ax=ax, width=1, color = [mc[5],mc[0]])
ax.legend(['0_HETLOSS', '>=1_LOSS'], fontsize=10)
ax.set_title('Co-Drivers Proportion per TP53 State', fontsize=14)
plt.yticks(fontsize=10)
ax.set_ylabel('')
# Keep matplotlib's own tick labels and only set their size: a hard-coded
# label list mislabels the axis whenever the automatic ticks differ from it.
ax.tick_params(axis='x', labelsize=10)
# Positional None toggles the grid exactly like the former b=None, and works
# both before and after matplotlib renamed that keyword to `visible`.
plt.grid(None)
plt.show()
co_drivers_mult_cnloh = create_co_drivers_table(master=master_no_wgd_cancer,
group_type='tp53_group',
group_1='>1muts',
group_2='>=1_cnLOH')
co_drivers_mult_cnloh
get_major_codrivers(master=master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == '>1muts'],
maf=maf_cohort_nowgd,
head=100)
# Mirrored horizontal bars: >1muts (left) vs >=1_cnLOH (right).
fig = plt.figure(figsize=(10,10))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
for _side in ('right', 'left', 'top'):
    ax.spines[_side].set_visible(False)
co_drivers_mult_cnloh[['proportion_1', 'proportion_2']].head(20)[::-1].plot.barh(stacked=True, ax=ax, width=1, color = [mc[3],mc[4]])
ax.legend(['>1muts', '>=1_cnLOH'], fontsize=10)
ax.set_title('Co-Drivers Proportion per TP53 State', fontsize=15)
plt.yticks(fontsize=10)
ax.set_ylabel('')
# Print the automatic tick positions for inspection.
a = ax.get_xticks().tolist()
print(a)
# Positional None toggles the grid exactly like the former b=None, and works
# both before and after matplotlib renamed that keyword to `visible`.
plt.grid(None)
plt.show()
co_drivers_wt_loss = create_co_drivers_table(master=master_no_wgd_cancer,
group_type='tp53_group',
group_1='1_WILD_TYPE',
group_2='0_HETLOSS')
co_drivers_wt_loss
# Mirrored horizontal bars: 1_WILD_TYPE (left) vs 0_HETLOSS (right).
fig = plt.figure(figsize=(10,10))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
for _side in ('right', 'left', 'top'):
    ax.spines[_side].set_visible(False)
co_drivers_wt_loss[['proportion_1', 'proportion_2']].head(20)[::-1].plot.barh(stacked=True, ax=ax, width=1, color = [mc[2],mc[5]])
ax.legend(['1_WILD_TYPE', '0_HETLOSS'], fontsize=10)
ax.set_title('Co-Drivers Proportion per TP53 State', fontsize=15)
plt.yticks(fontsize=10)
ax.set_ylabel('')
# Keep matplotlib's own tick labels and only set their size: a hard-coded
# label list mislabels the axis whenever the automatic ticks differ from it.
ax.tick_params(axis='x', labelsize=10)
# Positional None toggles the grid exactly like the former b=None, and works
# both before and after matplotlib renamed that keyword to `visible`.
plt.grid(None)
plt.show()
def get_major_codrivers(master: pd.DataFrame, maf: pd.DataFrame, head:int = 10, tp53=False):
    """Return the most frequently mutated driver genes among the samples of ``master``.

    This redefinition keeps the ``tp53`` keyword of the earlier definition in
    this file, so cells that pass ``tp53=True`` still work after this cell
    has been run.

    Parameters
    ----------
    master : table with a ``Tumor_Id`` column listing the samples to keep.
    maf : mutation table with ``Tumor_Sample_Barcode``, ``Hugo_Symbol`` and
        ``driver`` columns.
    head : number of genes to return.
    tp53 : when False (default) TP53 mutations are excluded from the counts.

    Returns
    -------
    DataFrame indexed by ``Hugo_Symbol`` with a single ``count`` column,
    sorted by ``count`` descending and truncated to ``head`` rows.
    """
    samples = master.Tumor_Id.tolist()

    # One combined mask instead of chained df[m1][m2][m3]: chaining makes
    # pandas re-align each later mask to the already filtered frame.
    keep = maf.Tumor_Sample_Barcode.isin(samples) & (maf['driver'] == True)  # noqa: E712
    if not tp53:
        keep = keep & (maf['Hugo_Symbol'] != 'TP53')
    maf_filtered = maf[keep]

    h = maf_filtered.groupby('Hugo_Symbol').size().to_frame('count')
    return h.sort_values(by='count', ascending=False).head(head)
def create_co_drivers_table_wgd(master_1: pd.DataFrame, master_2: pd.DataFrame, group_type:str, group_1: str):
    """Compare co-driver frequencies between a non-WGD sub-group and WGD TP53-LOH samples.

    Group 1 is the rows of ``master_1`` whose ``group_type`` column equals
    ``group_1``, counted in the module-level ``maf_cohort_nowgd``. Group 2 is
    the rows of ``master_2`` with ``tp53_count >= 1`` and
    ``tp53_loh_status == True``, counted in the module-level ``maf_cohort_wgd``.

    Returns
    -------
    DataFrame indexed by ``Hugo_Symbol`` (genes present in both groups) with
    columns ``count_x``, ``proportion_2``, ``count_y``, ``proportion_1``.
    ``proportion_2`` is negated so the two groups can be drawn as mirrored bars.
    """
    def _with_proportion(table, column):
        # Add each gene's percentage of the table's total count.
        total = table['count'].sum()
        table[column] = 100 * (table['count'] / total).round(4)
        return table

    master_group_1 = master_1[master_1[group_type] == group_1]
    co_drivers_group_1 = _with_proportion(
        get_major_codrivers(master=master_group_1, maf=maf_cohort_nowgd, head=100),
        'proportion_1')

    # One combined mask instead of chained boolean indexing.
    wgd_loh = (master_2['tp53_count'] >= 1) & (master_2['tp53_loh_status'] == True)  # noqa: E712
    master_group_2 = master_2[wgd_loh]
    co_drivers_group_2 = _with_proportion(
        get_major_codrivers(master=master_group_2, maf=maf_cohort_wgd, head=100),
        'proportion_2')

    co_drivers_groups = pd.merge(co_drivers_group_2, co_drivers_group_1, on='Hugo_Symbol')
    co_drivers_groups['proportion_2'] = -co_drivers_groups['proportion_2']
    return co_drivers_groups
co_drivers_wgd_loss = create_co_drivers_table_wgd(master_1=master_no_wgd_cancer,
master_2=master_wgd_cancer,
group_type='tp53_group',
group_1='>=1_LOSS')
co_drivers_wgd_loss
# Mirrored horizontal bars: WGD TP53-LOH samples (left) vs >=1_LOSS (right).
fig = plt.figure(figsize=(8,8))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
for _side in ('right', 'left', 'top'):
    ax.spines[_side].set_visible(False)
co_drivers_wgd_loss[['proportion_2', 'proportion_1']].head(15)[::-1].plot.barh(stacked=True, ax=ax, width=1, color = ['#7F8C8D',mc[0]])
ax.legend(['WGD - TP53 - LOH', '>=1_LOSS'], fontsize=10)
ax.set_title('Co-Drivers Proportion per TP53 State', fontsize=14)
plt.yticks(fontsize=10)
ax.set_ylabel('')
# Keep matplotlib's own tick labels and only set their size: a hard-coded
# label list mislabels the axis whenever the automatic ticks differ from it.
ax.tick_params(axis='x', labelsize=10)
# Positional None toggles the grid exactly like the former b=None, and works
# both before and after matplotlib renamed that keyword to `visible`.
plt.grid(None)
plt.show()
def get_master_codrivers(master: pd.DataFrame, maf: pd.DataFrame, symbol: str):
    """Return the rows of ``master`` whose sample has a mutation in ``symbol``.

    Parameters
    ----------
    master : table with a ``Tumor_Id`` column.
    maf : mutation table with ``Tumor_Sample_Barcode`` and ``Hugo_Symbol``.
    symbol : gene symbol to look for. Every ``maf`` row for that gene
        counts; the ``driver`` flag is not consulted.

    Returns
    -------
    An independent copy of the matching ``master`` rows, so callers can add
    columns without touching ``master``.
    """
    samples = master.Tumor_Id.tolist()
    # One combined mask instead of chained boolean indexing.
    hit = maf.Tumor_Sample_Barcode.isin(samples) & (maf['Hugo_Symbol'] == symbol)
    samples_final = maf.loc[hit, 'Tumor_Sample_Barcode'].tolist()
    return master[master.Tumor_Id.isin(samples_final)].copy()
# Fraction of Genome Altered in the >=1_cnLOH subgroup, overall and for the
# samples mutated in each selected gene.
# .copy() so the 'data' label is added to an independent frame.
master_no_wgd_cancer_cnloh = master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == '>=1_cnLOH'].copy()
master_APC = get_master_codrivers(master=master_no_wgd_cancer_cnloh,
                                  maf=maf_cohort_nowgd,
                                  symbol='APC')
master_KRAS = get_master_codrivers(master=master_no_wgd_cancer_cnloh,
                                   maf=maf_cohort_nowgd,
                                   symbol='KRAS')
# master_PIK3CA was labelled and plotted below without ever being computed
# for this subgroup; build it the same way as the other genes.
master_PIK3CA = get_master_codrivers(master=master_no_wgd_cancer_cnloh,
                                     maf=maf_cohort_nowgd,
                                     symbol='PIK3CA')
master_no_wgd_cancer_cnloh['data'] = '>=1_cnLOH'
master_APC['data'] = 'APC'
master_KRAS['data'] = 'KRAS'
master_PIK3CA['data'] = 'PIK3CA'
masters = [master_no_wgd_cancer_cnloh, master_APC, master_KRAS, master_PIK3CA]
allMasters = pd.concat(masters)
fig=plt.figure(figsize=(5,10))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
allMasters[['frac_genome_altered', 'data']].boxplot(by="data", ax=ax)
ax.set_title('Fraction of Genome Altered - >=1_cnLOH')
ax.set_xlabel('')
master_no_wgd_cancer_loss = master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == '>=1_LOSS']
master_APC = get_master_codrivers(master=master_no_wgd_cancer_loss,
maf=maf_cohort_nowgd,
symbol='APC')
master_KRAS = get_master_codrivers(master=master_no_wgd_cancer_loss,
maf=maf_cohort_nowgd,
symbol='KRAS')
master_PIK3CA = get_master_codrivers(master=master_no_wgd_cancer_loss,
maf=maf_cohort_nowgd,
symbol='PIK3CA')
master_no_wgd_cancer_loss['data'] = '>=1_loss'
master_APC['data'] = 'APC'
master_KRAS['data'] = 'KRAS'
master_PIK3CA['data'] = 'PIK3CA'
masters = [master_no_wgd_cancer_loss, master_APC, master_KRAS, master_PIK3CA]
allMasters = pd.concat(masters)
fig=plt.figure(figsize=(5,10))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
allMasters[['frac_genome_altered', 'data']].boxplot(by="data", ax=ax)
ax.set_title('Fraction of Genome Altered - >=1_LOSS')
ax.set_xlabel('')
# Fraction of Genome Altered in the >1muts subgroup, overall and for the
# samples mutated in each selected gene.
# .copy() so the 'data' label is added to an independent frame.
master_no_wgd_cancer_muts = master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == '>1muts'].copy()
master_APC = get_master_codrivers(master=master_no_wgd_cancer_muts,
                                  maf=maf_cohort_nowgd,
                                  symbol='APC')
master_KRAS = get_master_codrivers(master=master_no_wgd_cancer_muts,
                                   maf=maf_cohort_nowgd,
                                   symbol='KRAS')
master_KMT2D = get_master_codrivers(master=master_no_wgd_cancer_muts,
                                    maf=maf_cohort_nowgd,
                                    symbol='KMT2D')
master_RNF43 = get_master_codrivers(master=master_no_wgd_cancer_muts,
                                    maf=maf_cohort_nowgd,
                                    symbol='RNF43')
master_no_wgd_cancer_muts['data'] = '>1muts'
master_APC['data'] = 'APC'
master_KRAS['data'] = 'KRAS'
master_KMT2D['data'] = 'KMT2D'
master_RNF43['data'] = 'RNF43'
masters = [master_no_wgd_cancer_muts, master_APC, master_KRAS, master_KMT2D, master_RNF43]
allMasters = pd.concat(masters)
#### >1muts
# The section header above had been fused onto the next line, which commented
# out the figure creation; without it the axes land on the previous figure.
fig=plt.figure(figsize=(5,10))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
allMasters[['frac_genome_altered', 'data']].boxplot(by="data", ax=ax)
ax.set_title('Fraction of Genome Altered - >1muts')
ax.set_xlabel('')
master_no_wgd_cancer_wt = master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == '1_WILD_TYPE']
master_KRAS = get_master_codrivers(master=master_no_wgd_cancer_wt,
maf=maf_cohort_nowgd,
symbol='KRAS')
master_EGFR = get_master_codrivers(master=master_no_wgd_cancer_wt,
maf=maf_cohort_nowgd,
symbol='EGFR')
master_CDKN2A = get_master_codrivers(master=master_no_wgd_cancer_wt,
maf=maf_cohort_nowgd,
symbol='CDKN2A')
master_no_wgd_cancer_wt['data'] = '1_WT'
master_KRAS['data'] = 'KRAS'
master_EGFR['data'] = 'EGFR'
master_CDKN2A['data'] = 'CDKN2A'
masters = [master_no_wgd_cancer_wt, master_EGFR, master_KRAS, master_CDKN2A]
allMasters = pd.concat(masters)
fig=plt.figure(figsize=(5,10))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
allMasters[['frac_genome_altered', 'data']].boxplot(by="data", ax=ax)
ax.set_title('Fraction of Genome Altered - 1_WT')
ax.set_xlabel('')
Same here we take only samples with exactly 1 tp53 mutation (master_hotspot).
We have to define groups for CCF to see if there are differences between those groups. To have an idea of the CCF distribution we show here the distribution coming from the cancer_panel.

We see that the tp53_ccf distribution is very high for all subgroups except 1_WILD_TYPE and >1muts. >=1_LOSS is by far the biggest subgroup and has a very high median CCF.
It will be hard to cut the cohort based on the CCF. Let's try and see the size of the subcohorts:
# Samples with exactly one TP53 mutation, plus the 0_HETLOSS subgroup.
# .copy() so the ccf_group / vaf_group columns are added to an independent
# frame rather than to a filtered view of master_no_wgd_cancer.
master_ccf = master_no_wgd_cancer[(master_no_wgd_cancer['tp53_count'] == 1) | (master_no_wgd_cancer['tp53_group'] == '0_HETLOSS')].copy()

# CCF cut points: <= thr_ccf_1 is 'low', <= thr_ccf_2 is 'medium', above is 'high'.
thr_ccf_1 = 0.8
thr_ccf_2 = 0.9


def ccf_subgroup(x):
    """Bin ``x.tp53_ccf_1`` into 'low' / 'medium' / 'high'.

    A missing (NaN) value matches no comparison and yields ``None``.
    """
    if x.tp53_ccf_1 <= thr_ccf_1:
        return 'low'
    elif x.tp53_ccf_1 <= thr_ccf_2:
        return 'medium'
    elif x.tp53_ccf_1 > thr_ccf_2:
        return 'high'
    return None


master_ccf['ccf_group'] = master_ccf.apply(ccf_subgroup, axis=1)
get_groupby(master_ccf, 'ccf_group', 'count')

# VAF cut points, same scheme as the CCF ones.
thr_vaf_1 = 0.3
thr_vaf_2 = 0.4


def vaf_subgroup(x):
    """Bin ``x.tp53_vaf_1`` into 'low' / 'medium' / 'high'.

    A missing (NaN) value matches no comparison and yields ``None``.
    """
    if x.tp53_vaf_1 <= thr_vaf_1:
        return 'low'
    elif x.tp53_vaf_1 <= thr_vaf_2:
        return 'medium'
    elif x.tp53_vaf_1 > thr_vaf_2:
        return 'high'
    return None


master_ccf['vaf_group'] = master_ccf.apply(vaf_subgroup, axis=1)
get_groupby(master_ccf, 'vaf_group', 'count')
fig, ax = boxplot_sampletype(df=master_ccf,
group='tp53_group',
palette=palette,
order=['1_WILD_TYPE','0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
metrics='frac_genome_altered',
figsize=(5,10),
title='Fraction of Genome Altered - {}'.format(cancer),
xlim=[0,1])
plt.show()
get_statistics(df=master_ccf,
group='tp53_group',
metrics='frac_genome_altered',
group_list=['0_HETLOSS', '1_WILD_TYPE'])
master_low = master_ccf[(master_ccf['vaf_group'] == 'low') | (master_ccf['tp53_group'] == '0_HETLOSS')]
fig, ax = boxplot_sampletype(df=master_low,
group='tp53_group',
palette=palette,
order=['1_WILD_TYPE','0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
metrics='frac_genome_altered',
figsize=(5,10),
title='Fraction of Genome Altered - VAF < {} - {}'.format(thr_vaf_1,cancer),
xlim=[0,1])
plt.show()
get_statistics(df=master_low,
group='tp53_group',
metrics='frac_genome_altered',
group_list=['1_WILD_TYPE', '>=1_LOSS'])
master_med = master_ccf[(master_ccf['vaf_group'] == 'medium') | (master_ccf['tp53_group'] == '0_HETLOSS')]
fig, ax = boxplot_sampletype(df=master_med,
group='tp53_group',
palette=palette,
order=['1_WILD_TYPE', '0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
metrics='frac_genome_altered',
figsize=(5,10),
title='Fraction of Genome Altered - {} < VAF < {} - {}'.format(thr_vaf_1,thr_vaf_2,cancer),
xlim=[0,1])
plt.show()
get_statistics(df=master_med,
group='tp53_group',
metrics='frac_genome_altered',
group_list=['1_WILD_TYPE', '0_HETLOSS'])
master_high = master_ccf[(master_ccf['vaf_group'] == 'high') | (master_ccf['tp53_group'] == '0_HETLOSS')]
fig, ax = boxplot_sampletype(df=master_high,
group='tp53_group',
palette=palette,
order=['1_WILD_TYPE', '0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
metrics='frac_genome_altered',
figsize=(5,10),
title='Fraction of Genome Altered - VAF > {} - {}'.format(thr_vaf_2,cancer),
xlim=[0,1])
plt.show()
get_statistics(df=master_high,
group='tp53_group',
metrics='frac_genome_altered',
group_list=['1_WILD_TYPE', '0_HETLOSS'])
fig, ax = boxplot_sampletype(df=master_ccf,
group='tp53_group',
palette=palette,
order=['1_WILD_TYPE','0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
metrics='frac_genome_altered',
figsize=(5,10),
title='Fraction of Genome Altered - {}'.format(cancer),
xlim=[0,1])
plt.show()
get_statistics(df=master_ccf,
group='tp53_group',
metrics='frac_genome_altered',
group_list=['0_HETLOSS', '1_WILD_TYPE'])
master_low = master_ccf[(master_ccf['ccf_group'] == 'low') | (master_ccf['tp53_group'] == '0_HETLOSS')]
fig, ax = boxplot_sampletype(df=master_low,
group='tp53_group',
palette=palette,
order=['1_WILD_TYPE','0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
metrics='frac_genome_altered',
figsize=(5,10),
title='Fraction of Genome Altered - CCF < {} - {}'.format(thr_ccf_1,cancer),
xlim=[0,1])
plt.show()
get_statistics(df=master_low,
group='tp53_group',
metrics='frac_genome_altered',
group_list=['1_WILD_TYPE', '0_HETLOSS'])
master_med = master_ccf[(master_ccf['ccf_group'] == 'medium') | (master_ccf['tp53_group'] == '0_HETLOSS')]
fig, ax = boxplot_sampletype(df=master_med,
group='tp53_group',
palette=palette,
order=['1_WILD_TYPE', '0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
metrics='frac_genome_altered',
figsize=(5,10),
title='Fraction of Genome Altered - {} < CCF < {} - {}'.format(thr_ccf_1,thr_ccf_2,cancer),
xlim=[0,1])
plt.show()
get_statistics(df=master_med,
group='tp53_group',
metrics='frac_genome_altered',
group_list=['1_WILD_TYPE', '0_HETLOSS'])
master_high = master_ccf[(master_ccf['ccf_group'] == 'high') | (master_ccf['tp53_group'] == '0_HETLOSS')]
fig, ax = boxplot_sampletype(df=master_high,
group='tp53_group',
palette=palette,
order=['1_WILD_TYPE', '0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
metrics='frac_genome_altered',
figsize=(5,10),
title='Fraction of Genome Altered - CCF > {} - {}'.format(thr_ccf_2,cancer),
xlim=[0,1])
plt.show()
get_statistics(df=master_high,
group='tp53_group',
metrics='frac_genome_altered',
group_list=['1_WILD_TYPE', '0_HETLOSS'])
Let's check the hotspot distribution.
get_hotspot_frac(df=master_high[master_high['tp53_group'] == '1_WILD_TYPE'],
group_type=None,
group=None)
# vaf_group / ccf_group are created on master_ccf, not on master_hotspot.
# Both frames are filtered from master_no_wgd_cancer, and master_hotspot is
# exactly the tp53_count == 1 part, so take those rows from master_ccf.
# NOTE(review): if master_hotspot gains these columns elsewhere, confirm
# which frame was intended.
_master_hotspot_levels = master_ccf[master_ccf['tp53_count'] == 1]
_level_order = ['low', 'medium', 'high']
_level_palette = {'low': tab10[0] , 'medium': tab10[1], 'high':tab10[2]}

fig, ax = boxplot_sampletype(df=_master_hotspot_levels,
                             group='vaf_group',
                             palette=_level_palette,
                             order=_level_order,
                             metrics='frac_genome_altered',
                             figsize=(3,10),
                             title='Fraction of Genome Altered - VAF levels - {}'.format(cancer),
                             xlim=[0,1])
plt.show()
get_statistics(df=_master_hotspot_levels,
               group='vaf_group',
               metrics='frac_genome_altered',
               group_list=['low', 'medium'])
fig, ax = boxplot_sampletype(df=_master_hotspot_levels,
                             group='ccf_group',
                             palette=_level_palette,
                             order=_level_order,
                             metrics='frac_genome_altered',
                             figsize=(3,10),
                             title='Fraction of Genome Altered - CCF levels - {}'.format(cancer),
                             xlim=[0,1])
plt.show()
get_statistics(df=_master_hotspot_levels,
               group='ccf_group',
               metrics='frac_genome_altered',
               group_list=['low', 'medium'])
#### TP53 Residual Groups#fig=plt.figure(figsize=(10,3))
ax = plt.subplot2grid(shape=(4,1), loc=(0,0), colspan=1)
sns.boxplot(x='Patient_Current_Age',data=master_no_wgd_cancer, ax=ax).set_title('Patient Age - {}'.format(cancer), weight='bold', fontsize=14)
style(ax)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
fig, ax = boxplot_sampletype(df=master_no_wgd_cancer,
group='tp53_res_group',
palette=palette_res,
order=res_group_list,
metrics='Patient_Current_Age',
figsize=(3,10),
title='Patient Current Age - {}'.format(cancer),
xlim=[20,100])
plt.show()
get_statistics(df=master_no_wgd_cancer,
group='tp53_res_group',
metrics='Patient_Current_Age',
group_list=['tp53_res', 'no_tp53_res'])#### TP53 Subgroups
fig, ax = boxplot_sampletype(df=master_no_wgd_cancer,
group='tp53_group',
palette=palette,
order=group_list,
metrics='Patient_Current_Age',
figsize=(7,10),
title='Patient Current Age - {}'.format(cancer),
xlim=[20,100])
plt.show()
get_statistics(df=master_no_wgd_cancer,
group='tp53_group',
metrics='Patient_Current_Age',
group_list=['1_WILD_TYPE', '>=1_cnLOH'])
h = get_groupby(master_no_wgd_cancer,'Sex', 'count').sort_values(by='count', ascending=False)
display(h)
h = h.T
h = h[['Male', 'Female']]
fig = plt.figure(figsize=(6,1))
ax = plt.subplot()
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
h_plot = h.plot(kind = 'barh', stacked=True, yticks=[], ax=ax)
ax.legend(['Male', 'Female'],loc='center left', bbox_to_anchor=(1.1, 0.5), fontsize=11)
ax.set_title('Sex Distribution - {} - No WGD'.format(cancer), weight='bold', fontsize=18)
plt.show()
from lifelines import KaplanMeierFitter
from lifelines.statistics import logrank_test
# Log-rank test of overall survival between the 'tp53_res' and 'no_tp53_res' groups.
# Rows without a survival time or a survival status are dropped first.
data = master_no_wgd_cancer.dropna(subset=['Overall_Survival_Months', 'Overall_Survival_Status'])

# Event indicator: 1 when the status is 'DECEASED', 0 otherwise.
# A vectorised comparison replaces the row-wise apply: it is faster and always
# yields a Series, even if the frame has no rows.
data['Overall Survival Status 0/1'] = (data['Overall_Survival_Status'] == 'DECEASED').astype(int)
data = data[['tp53_group', 'tp53_res_group', 'Overall Survival Status 0/1', 'Overall_Survival_Months']]

# Boolean masks selecting each of the two compared groups.
ix1 = data['tp53_res_group'] == 'tp53_res'
ix2 = data['tp53_res_group'] == 'no_tp53_res'

# T_* are durations in months, E_* the matching event indicators.
T_exp, E_exp = data.loc[ix1, 'Overall_Survival_Months'], data.loc[ix1, 'Overall Survival Status 0/1']
T_con, E_con = data.loc[ix2, 'Overall_Survival_Months'], data.loc[ix2, 'Overall Survival Status 0/1']

results = logrank_test(T_exp, T_con, event_observed_A=E_exp, event_observed_B=E_con)
results.print_summary()
# Kaplan-Meier overall-survival curves, one per TP53 residual group,
# computed on the rows of master_no_wgd_cancer and drawn on shared axes.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - {}'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()
for i, group in enumerate(res_group_list):
    data = master_no_wgd_cancer[master_no_wgd_cancer['tp53_res_group'] == group].dropna(
        subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    if data.empty:
        # Nothing to fit for this group: report it instead of failing mid-figure.
        print('No survival data for group {!r} - curve skipped'.format(group))
        continue
    # Event indicator: 1 when the status is 'DECEASED', 0 otherwise.
    data['Overall Survival Status 0/1'] = (data['Overall_Survival_Status'] == 'DECEASED').astype(int)
    # The same fitter is re-fitted for every group; each fit is plotted before the next.
    kmf.fit(np.array(data['Overall_Survival_Months']),
            event_observed=np.array(data['Overall Survival Status 0/1']),
            label=group)
    kmf.plot_survival_function(color=res_palette_list[i], ax=ax)
plt.show()
# Kaplan-Meier overall-survival curves, one per TP53 subgroup,
# computed on the rows of master_no_wgd_cancer and drawn on shared axes.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - {}'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()
for i, group in enumerate(group_list):
    data = master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == group].dropna(
        subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    if data.empty:
        # Nothing to fit for this group: report it instead of failing mid-figure.
        print('No survival data for group {!r} - curve skipped'.format(group))
        continue
    # Event indicator: 1 when the status is 'DECEASED', 0 otherwise.
    data['Overall Survival Status 0/1'] = (data['Overall_Survival_Status'] == 'DECEASED').astype(int)
    # The same fitter is re-fitted for every group; each fit is plotted before the next.
    kmf.fit(np.array(data['Overall_Survival_Months']),
            event_observed=np.array(data['Overall Survival Status 0/1']),
            label=group)
    kmf.plot_survival_function(color=palette_list[i], ax=ax)
plt.show()
# Subcohort: rows whose ccf_group is 'high', plus every '0_HETLOSS' row.
# NOTE(review): 0_HETLOSS rows are kept regardless of ccf_group, presumably as the
# mutation-free reference group - confirm.
master_high = master_ccf[(master_ccf['ccf_group'] == 'high') | (master_ccf['tp53_group'] == '0_HETLOSS')]

# Kaplan-Meier overall-survival curves per TP53 residual group within that subcohort.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - {} - High CCF (CCF>0.9)'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()
for i, group in enumerate(res_group_list):
    data = master_high[master_high['tp53_res_group'] == group].dropna(
        subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    if data.empty:
        # Groups absent from the subcohort are skipped explicitly rather than via an exception.
        print('No survival data for group {!r} - curve skipped'.format(group))
        continue
    try:
        # Event indicator: 1 when the status is 'DECEASED', 0 otherwise.
        data['Overall Survival Status 0/1'] = (data['Overall_Survival_Status'] == 'DECEASED').astype(int)
        kmf.fit(np.array(data['Overall_Survival_Months']),
                event_observed=np.array(data['Overall Survival Status 0/1']),
                label=group)
        kmf.plot_survival_function(color=res_palette_list[i], ax=ax)
    except Exception as err:
        # Best effort: one failing group must not abort the figure, but it is reported.
        print('Survival curve skipped for group {!r}: {!r}'.format(group, err))
plt.show()
# Kaplan-Meier overall-survival curves per TP53 subgroup within master_high.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - {} - High CCF (CCF>0.9)'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()
for i, group in enumerate(group_list):
    data = master_high[master_high['tp53_group'] == group].dropna(
        subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    if data.empty:
        # Groups absent from the subcohort are skipped explicitly rather than via an exception.
        print('No survival data for group {!r} - curve skipped'.format(group))
        continue
    try:
        # Event indicator: 1 when the status is 'DECEASED', 0 otherwise.
        data['Overall Survival Status 0/1'] = (data['Overall_Survival_Status'] == 'DECEASED').astype(int)
        kmf.fit(np.array(data['Overall_Survival_Months']),
                event_observed=np.array(data['Overall Survival Status 0/1']),
                label=group)
        kmf.plot_survival_function(color=palette_list[i], ax=ax)
    except Exception as err:
        # Best effort: one failing group must not abort the figure, but it is reported.
        print('Survival curve skipped for group {!r}: {!r}'.format(group, err))
plt.show()
# Subcohort: rows whose ccf_group is 'low', plus every '0_HETLOSS' row.
# NOTE(review): 0_HETLOSS rows are kept regardless of ccf_group, presumably as the
# mutation-free reference group - confirm.
master_low = master_ccf[(master_ccf['ccf_group'] == 'low') | (master_ccf['tp53_group'] == '0_HETLOSS')]

# Kaplan-Meier overall-survival curves per TP53 residual group within that subcohort.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - {} - Low CCF (CCF<0.8)'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()
for i, group in enumerate(res_group_list):
    data = master_low[master_low['tp53_res_group'] == group].dropna(
        subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    if data.empty:
        # Groups absent from the subcohort are skipped explicitly rather than via an exception.
        print('No survival data for group {!r} - curve skipped'.format(group))
        continue
    try:
        # Event indicator: 1 when the status is 'DECEASED', 0 otherwise.
        data['Overall Survival Status 0/1'] = (data['Overall_Survival_Status'] == 'DECEASED').astype(int)
        kmf.fit(np.array(data['Overall_Survival_Months']),
                event_observed=np.array(data['Overall Survival Status 0/1']),
                label=group)
        kmf.plot_survival_function(color=res_palette_list[i], ax=ax)
    except Exception as err:
        # Best effort: one failing group must not abort the figure, but it is reported.
        print('Survival curve skipped for group {!r}: {!r}'.format(group, err))
plt.show()
# Kaplan-Meier overall-survival curves per TP53 subgroup within master_low.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - {} - Low CCF (CCF<0.8)'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()
for i, group in enumerate(group_list):
    data = master_low[master_low['tp53_group'] == group].dropna(
        subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    if data.empty:
        # Groups absent from the subcohort are skipped explicitly rather than via an exception.
        print('No survival data for group {!r} - curve skipped'.format(group))
        continue
    try:
        # Event indicator: 1 when the status is 'DECEASED', 0 otherwise.
        data['Overall Survival Status 0/1'] = (data['Overall_Survival_Status'] == 'DECEASED').astype(int)
        kmf.fit(np.array(data['Overall_Survival_Months']),
                event_observed=np.array(data['Overall Survival Status 0/1']),
                label=group)
        kmf.plot_survival_function(color=palette_list[i], ax=ax)
    except Exception as err:
        # Best effort: one failing group must not abort the figure, but it is reported.
        print('Survival curve skipped for group {!r}: {!r}'.format(group, err))
plt.show()
# Subcohort built by the project helper `get_master_codrivers` for symbol 'EGFR'.
# NOTE(review): presumably the samples carrying an EGFR co-driver alteration - confirm
# against the helper.
master_EGFR = get_master_codrivers(master=master_no_wgd_cancer,
                                   maf=maf_cohort_nowgd,
                                   symbol='EGFR')

# Kaplan-Meier overall-survival curves per TP53 residual group within that subcohort.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - {} - EGFR'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()
for i, group in enumerate(res_group_list):
    data = master_EGFR[master_EGFR['tp53_res_group'] == group].dropna(
        subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    if data.empty:
        # Groups absent from the subcohort are skipped explicitly rather than via an exception.
        print('No survival data for group {!r} - curve skipped'.format(group))
        continue
    try:
        # Event indicator: 1 when the status is 'DECEASED', 0 otherwise.
        data['Overall Survival Status 0/1'] = (data['Overall_Survival_Status'] == 'DECEASED').astype(int)
        kmf.fit(np.array(data['Overall_Survival_Months']),
                event_observed=np.array(data['Overall Survival Status 0/1']),
                label=group)
        kmf.plot_survival_function(color=res_palette_list[i], ax=ax)
    except Exception as err:
        # Best effort: one failing group must not abort the figure, but it is reported.
        print('Survival curve skipped for group {!r}: {!r}'.format(group, err))
plt.show()
# Kaplan-Meier overall-survival curves per TP53 subgroup within master_EGFR.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - {} - EGFR'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()
for i, group in enumerate(group_list):
    data = master_EGFR[master_EGFR['tp53_group'] == group].dropna(
        subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    if data.empty:
        # Groups absent from the subcohort are skipped explicitly rather than via an exception.
        print('No survival data for group {!r} - curve skipped'.format(group))
        continue
    try:
        # Event indicator: 1 when the status is 'DECEASED', 0 otherwise.
        data['Overall Survival Status 0/1'] = (data['Overall_Survival_Status'] == 'DECEASED').astype(int)
        kmf.fit(np.array(data['Overall_Survival_Months']),
                event_observed=np.array(data['Overall Survival Status 0/1']),
                label=group)
        kmf.plot_survival_function(color=palette_list[i], ax=ax)
    except Exception as err:
        # Best effort: one failing group must not abort the figure, but it is reported.
        print('Survival curve skipped for group {!r}: {!r}'.format(group, err))
plt.show()
# Subcohort built by the project helper `get_master_codrivers` for symbol 'KRAS'.
# NOTE(review): presumably the samples carrying a KRAS co-driver alteration - confirm
# against the helper.
master_KRAS = get_master_codrivers(master=master_no_wgd_cancer,
                                   maf=maf_cohort_nowgd,
                                   symbol='KRAS')

# Kaplan-Meier overall-survival curves per TP53 residual group within that subcohort.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - {} - KRAS'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()
for i, group in enumerate(res_group_list):
    data = master_KRAS[master_KRAS['tp53_res_group'] == group].dropna(
        subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    if data.empty:
        # Groups absent from the subcohort are skipped explicitly rather than via an exception.
        print('No survival data for group {!r} - curve skipped'.format(group))
        continue
    try:
        # Event indicator: 1 when the status is 'DECEASED', 0 otherwise.
        data['Overall Survival Status 0/1'] = (data['Overall_Survival_Status'] == 'DECEASED').astype(int)
        kmf.fit(np.array(data['Overall_Survival_Months']),
                event_observed=np.array(data['Overall Survival Status 0/1']),
                label=group)
        kmf.plot_survival_function(color=res_palette_list[i], ax=ax)
    except Exception as err:
        # Best effort: one failing group must not abort the figure, but it is reported.
        print('Survival curve skipped for group {!r}: {!r}'.format(group, err))
plt.show()
# Kaplan-Meier overall-survival curves per TP53 subgroup within master_KRAS.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - {} - KRAS'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()
for i, group in enumerate(group_list):
    data = master_KRAS[master_KRAS['tp53_group'] == group].dropna(
        subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    if data.empty:
        # Groups absent from the subcohort are skipped explicitly rather than via an exception.
        print('No survival data for group {!r} - curve skipped'.format(group))
        continue
    try:
        # Event indicator: 1 when the status is 'DECEASED', 0 otherwise.
        data['Overall Survival Status 0/1'] = (data['Overall_Survival_Status'] == 'DECEASED').astype(int)
        kmf.fit(np.array(data['Overall_Survival_Months']),
                event_observed=np.array(data['Overall Survival Status 0/1']),
                label=group)
        kmf.plot_survival_function(color=palette_list[i], ax=ax)
    except Exception as err:
        # Best effort: one failing group must not abort the figure, but it is reported.
        print('Survival curve skipped for group {!r}: {!r}'.format(group, err))
plt.show()
# Subcohort built by the project helper `get_master_codrivers` for symbol 'CDKN2A'.
# NOTE(review): presumably the samples carrying a CDKN2A co-driver alteration - confirm
# against the helper.
master_CDKN2A = get_master_codrivers(master=master_no_wgd_cancer,
                                     maf=maf_cohort_nowgd,
                                     symbol='CDKN2A')

# Kaplan-Meier overall-survival curves per TP53 residual group within that subcohort.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - {} - CDKN2A'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()
for i, group in enumerate(res_group_list):
    data = master_CDKN2A[master_CDKN2A['tp53_res_group'] == group].dropna(
        subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    if data.empty:
        # Groups absent from the subcohort are skipped explicitly rather than via an exception.
        print('No survival data for group {!r} - curve skipped'.format(group))
        continue
    try:
        # Event indicator: 1 when the status is 'DECEASED', 0 otherwise.
        data['Overall Survival Status 0/1'] = (data['Overall_Survival_Status'] == 'DECEASED').astype(int)
        kmf.fit(np.array(data['Overall_Survival_Months']),
                event_observed=np.array(data['Overall Survival Status 0/1']),
                label=group)
        kmf.plot_survival_function(color=res_palette_list[i], ax=ax)
    except Exception as err:
        # Best effort: one failing group must not abort the figure, but it is reported.
        print('Survival curve skipped for group {!r}: {!r}'.format(group, err))
plt.show()
# Kaplan-Meier overall-survival curves per TP53 subgroup within master_CDKN2A.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - {} - CDKN2A'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()
for i, group in enumerate(group_list):
    data = master_CDKN2A[master_CDKN2A['tp53_group'] == group].dropna(
        subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    if data.empty:
        # Groups absent from the subcohort are skipped explicitly rather than via an exception.
        print('No survival data for group {!r} - curve skipped'.format(group))
        continue
    try:
        # Event indicator: 1 when the status is 'DECEASED', 0 otherwise.
        data['Overall Survival Status 0/1'] = (data['Overall_Survival_Status'] == 'DECEASED').astype(int)
        kmf.fit(np.array(data['Overall_Survival_Months']),
                event_observed=np.array(data['Overall Survival Status 0/1']),
                label=group)
        kmf.plot_survival_function(color=palette_list[i], ax=ax)
    except Exception as err:
        # Best effort: one failing group must not abort the figure, but it is reported.
        print('Survival curve skipped for group {!r}: {!r}'.format(group, err))
plt.show()